R Markdown
# Load the necessary libraries
library(tm)
## Loading required package: NLP
library(SnowballC)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
library(syuzhet)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:syuzhet':
##
## rescale
library(reshape2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 3.6.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
#Read President George Walker Bush's Inaugural speeches
BOtext<- readLines('C:/Users/skimitei/Desktop/r_code/BO.txt')
# Load the data speech as a corpus
BO <- Corpus(VectorSource(BOtext))
#############Text transformation - Text wrangling in Text Mining#############
#----------------------------------------------------------------------------
toSpace <- content_transformer(function (x , pattern ) gsub(pattern, " ", x))
BO<- tm_map(BO, toSpace, "/")
## Warning in tm_map.SimpleCorpus(BO, toSpace, "/"): transformation drops documents
BO<- tm_map(BO, toSpace, "@")
## Warning in tm_map.SimpleCorpus(BO, toSpace, "@"): transformation drops documents
# Convert the text to lower case
BO <- tm_map(BO, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(BO, content_transformer(tolower)): transformation
## drops documents
# Remove numbers
BO <- tm_map(BO, removeNumbers)
## Warning in tm_map.SimpleCorpus(BO, removeNumbers): transformation drops
## documents
# Remove english common stopwords
BO <- tm_map(BO, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(BO, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
BO<- tm_map(BO, removeWords, c("blabla1", "blabla2"))
## Warning in tm_map.SimpleCorpus(BO, removeWords, c("blabla1", "blabla2")):
## transformation drops documents
# Remove punctuations
BO <- tm_map(BO, removePunctuation)
## Warning in tm_map.SimpleCorpus(BO, removePunctuation): transformation drops
## documents
# Eliminate extra white spaces
BO <- tm_map(BO, stripWhitespace)
## Warning in tm_map.SimpleCorpus(BO, stripWhitespace): transformation drops
## documents
# Text stemming
# docs <- tm_map(docs, stemDocument)
BO <- tm_map(BO, toSpace, "\\|")
## Warning in tm_map.SimpleCorpus(BO, toSpace, "\\|"): transformation drops
## documents
# Build a term-document matrix of President George Bush Jr's inaugural speeches
# and display the top ten most used words
dtm <- TermDocumentMatrix(BO)
#find associations with words by correlation with the findAssocs() function
findAssocs(dtm, c("country" , "american","nation", "freedom","love"), corlimit=0.60) # specifying a correlation limit of 0.60
## $country
## make away commitments home illness lives
## 0.72 0.63 0.63 0.63 0.63 0.63
## loss lucky matter medicaid medicare recognize
## 0.63 0.63 0.63 0.63 0.63 0.63
## reserved risks sap social sudden swept
## 0.63 0.63 0.63 0.63 0.63 0.63
## takers terrible
## 0.63 0.63
##
## $american
## ultimately need person single acting bring equip
## 0.66 0.66 0.66 0.66 0.63 0.63 0.63
## ever fidelity forces freedoms labs math militias
## 0.63 0.63 0.63 0.63 0.63 0.63 0.63
## muskets networks preserving research responses soldiers teachers
## 0.63 0.63 0.63 0.63 0.63 0.63 0.63
## times <U+0092>ll
## 0.63 0.63
##
## $nation
## numeric(0)
##
## $freedom
## king along bound declare evident falls
## 0.63 0.60 0.60 0.60 0.60 0.60
## footprints forebears guides hear inextricably left
## 0.60 0.60 0.60 0.60 0.60 0.60
## preacher selma seneca soul star stonewall
## 0.60 0.60 0.60 0.60 0.60 0.60
## sung unsung walk
## 0.60 0.60 0.60
##
## $love
## another anyone appalachia bright
## 1.00 1.00 1.00 1.00
## brothers cared cherished commit
## 1.00 1.00 1.00 1.00
## complete daughters detroit earn
## 1.00 1.00 1.00 1.00
## efforts engineers enlisted exercise
## 1.00 1.00 1.00 1.00
## expelled forced gay hills
## 1.00 1.00 1.00 1.00
## hopeful immigrants lanes living
## 1.00 1.00 1.00 1.00
## mothers newtown pioneers right
## 1.00 1.00 1.00 1.00
## sisters streets striving students
## 1.00 1.00 1.00 1.00
## treated truly vote wait
## 1.00 1.00 1.00 1.00
## welcome wives workforce journey
## 1.00 1.00 1.00 0.93
## equal applause land young
## 0.83 0.74 0.70 0.70
## began surely see quiet
## 0.70 0.70 0.70 0.70
## safe else citizen harm
## 0.70 0.70 0.70 0.70
## generation<U+0092>s
## 0.70
# convert dtm into a matrix
dtm<-as.matrix(dtm)
#dtm[1:10,1:10]
w <- sort(rowSums(dtm),decreasing=TRUE)
BO <- data.frame(word = names(w),freq=w)
head(BO, 10)
## word freq
## will will 39
## applause applause 25
## must must 24
## can can 20
## nation nation 18
## people people 18
## new new 17
## time time 16
## every every 15
## america america 14
rownames(BO)<-NULL
head(BO, 10)
## word freq
## 1 will 39
## 2 applause 25
## 3 must 24
## 4 can 20
## 5 nation 18
## 6 people 18
## 7 new 17
## 8 time 16
## 9 every 15
## 10 america 14
dtm<-subset(w,w>=5)
v<-dtm
#Create a barplot of the top words as used by President Walker Bush in his inaugural - Version I
barplot(dtm,las=2,col=rainbow(100))

# Print the ten most frequent word used in the addresses
BO <- data.frame(word = names(v),freq=v)
head(BO, 10)
## word freq
## will will 39
## applause applause 25
## must must 24
## can can 20
## nation nation 18
## people people 18
## new new 17
## time time 16
## every every 15
## america america 14
rownames(BO)<-NULL
head(BO, 10)
## word freq
## 1 will 39
## 2 applause 25
## 3 must 24
## 4 can 20
## 5 nation 18
## 6 people 18
## 7 new 17
## 8 time 16
## 9 every 15
## 10 america 14
#Create a barplot of the top words as used by President Walker Bush in his inaugural - Version II
barplot(BO[1:50,]$freq, las = 2, names.arg = BO[1:50,]$word,
col ="lightblue", main ="President Barack Obama's Most frequent inauguration words",
ylab = "Word frequencies")

# Sentimental Analysis
s<-get_nrc_sentiment(BOtext)
#View the sentimental analysis of the first 6 paragraphs
head(s)
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 1 0 1
## 2 0 0 0 0 0 0 0 0 0 0
## 3 0 1 1 1 1 2 1 5 1 7
## 4 0 0 0 0 0 0 0 0 0 0
## 5 2 3 1 1 2 0 0 3 2 6
## 6 0 0 0 0 0 0 0 0 0 0
#View the third paragraph
BOtext[3]
## [1] "I stand here today humbled by the task before us, grateful for the trust you have bestowed, mindful of the sacrifices borne by our ancestors. I thank President Bush for his service to our nation, as well as the generosity and cooperation he has shown throughout this transition."
#Check the sentimental analysis of some words in the third paragraph
get_nrc_sentiment("grateful")
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 0 0 1
get_nrc_sentiment("generosity")
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 1 0 0 1 0 1 1 0 1
#View the third paragraph
BOtext[9]
## [1] "These are the indicators of crisis, subject to data and statistics. Less measurable but no less profound is a sapping of confidence across our land; a nagging fear that America's decline is inevitable, and that the next generation must lower its sights."
#Check the sentimental analysis of some words in the third paragraph
get_nrc_sentiment("lower")
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 1 0 0 1 0
get_nrc_sentiment("decline")
## anger anticipation disgust fear joy sadness surprise trust negative positive
## 1 0 0 0 0 0 0 0 0 1 0
barplot(colSums(s),las=2,col=rainbow(10),
ylab='count',main="President Barak Obama's.inaugural sentimental plot")

# Create a word cloud of President Barack Obama's inaugural speeches
# print(BO)
set.seed(1234)
library(wordcloud2)
wordcloud(words = BO$word, freq = BO$freq, min.freq = 2,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))

# Get sentiments using the four different lexicons
syuzhet <- get_sentiment(BOtext, method="syuzhet")
bing <- get_sentiment(BOtext, method="bing")
afinn <- get_sentiment(BOtext, method="afinn")
nrc <- get_sentiment(BOtext, method="nrc")
sentiments <- data.frame(syuzhet, bing, afinn, nrc)
# get the emotions using the NRC dictionary
emotions <- get_nrc_sentiment(BOtext)
emo_bar = colSums(emotions)
emo_sum = data.frame(count=emo_bar, emotion=names(emo_bar))
emo_sum$emotion = factor(emo_sum$emotion, levels=emo_sum$emotion[order(emo_sum$count, decreasing = TRUE)])
# plot the different sentiments from different methods
plot_ly(sentiments, x=~emotions, y=~syuzhet, type="scatter", mode="jitter", name="syuzhet") %>%
add_trace(y=~bing, mode="lines", name="bing") %>%
add_trace(y=~afinn, mode="lines", name="afinn") %>%
add_trace(y=~nrc, mode="lines", name="nrc") %>%
layout(title="President Barack Obama's inaugural speech sentiment",
yaxis=list(title="score"), xaxis=list(title="Emotions"))